---
title: "Replication_Script"
output: html_document
date: "2024-01-23"
---

# Readme

This is the replication script for the paper "The Role of Topic Expertise for Interruption Behavior in Parliament" by Julius Diener.

In the section "Main Analysis" all the code is provided to replicate all analyses in the paper, excluding the classification of speeches using the ParlBERT model and the fitting of the topic model, which can be found at the end. 

If there are any questions regarding the analysis, please reach out to me via e-mail (julius.diener@uni-mannheim.de).

IMPORTANT: Several parts of this code (Network Models, Classification, Topic Model) may take multiple hours to days to run. I thus include the saved outputs in the replication files. The code to create these outputs is commented out. If you wish to run the code yourself, un-comment the code. The script for classifying the speeches using ParlBERT is in a separate script. Using this script requires a working python distribution.

# Setup

```{r setup, include=FALSE}
# This is the setupchunk, it loads all important packages etc
knitr::opts_chunk$set(echo = TRUE)

p_needed <-
  c("viridis", "stargazer", "MASS", "haven", "ggplot2", "tidyverse", "rvest", "quanteda", "rmarkdown", "knitr", "readtext", "quanteda.textmodels", "quanteda.textstats", "quanteda.textplots", "lubridate", "stm", "readODS", "janitor", "cowplot", "stringr", "tm", "legislatoR", "keyATM", "margins")


packages <- rownames(installed.packages())

p_to_install <- p_needed[!(p_needed %in% packages)]

if (length(p_to_install) > 0) {
  install.packages(p_to_install)
}

sapply(p_needed, require, character.only = TRUE)

stargazer_opt <- ifelse(knitr::is_latex_output(), "latex", "html")
```

# Main Analysis:

## Load Data

```{r}
load("dyads_final.RData")
```

## Regression models

### Logistic Regression

```{r}
# main model
dyads$bin_int <- ifelse(dyads$count > 0, 1, 0)
model1R_bin <- glm(bin_int ~ topic_similarity_BERT + OppoOppo + OppoGov + GovOppo + frontbencherSpeaker + firsttermInter + femaleInter + femaleSpeaker + samedistrict + n_sentSpeaker + sameday_speeches, family = binomial(link = "logit"), data = dyads)
summary(model1R_bin)

# robustness model
model1R_bin_rob <- glm(bin_int ~ topic_similarity_keyATM + OppoOppo + OppoGov + GovOppo + frontbencherSpeaker + firsttermInter + femaleInter + femaleSpeaker + samedistrict + n_sentSpeaker + sameday_speeches, family = binomial(link = "logit"), data = dyads)
summary(model1R_bin_rob)
```

### AMEN models

On my work laptop, fitting these models takes ~10 hours. If just want to see the outputs, load the supplied output files as seen below.

#### Setup

```{r}
# # This chunk sets up the necessary data structure for the AMEN models
# 
# Dependent variables
# Y <- xtabs(count ~ fullnameInter + fullnameSpeaker, dyads)
# names(dimnames(Y)) <- NULL
# 
# dyads$bin_int <- ifelse(dyads$count > 0, 1, 0)
# 
# Y_bin <- xtabs(bin_int ~ fullnameInter + fullnameSpeaker, dyads)
# names(dimnames(Y_bin)) <- NULL
# 
# # Edge Characteristics
# Xd_simil <- xtabs(topic_similarity_BERT ~ fullnameInter + fullnameSpeaker, dyads, addNA = T)
# names(dimnames(Xd_simil)) <- NULL
# 
# Xd_simil_keyATM <- xtabs(topic_similarity_keyATM ~ fullnameInter + fullnameSpeaker, dyads, addNA = T)
# names(dimnames(Xd_simil_keyATM)) <- NULL
# 
# Xd_OppoOppo <- xtabs(OppoOppo ~ fullnameInter + fullnameSpeaker, dyads, addNA = T)
# names(dimnames(Xd_OppoOppo)) <- NULL
# 
# Xd_OppoGov <- xtabs(OppoGov ~ fullnameInter + fullnameSpeaker, dyads, addNA = T)
# names(dimnames(Xd_OppoGov)) <- NULL
# 
# Xd_GovOppo <- xtabs(GovOppo ~ fullnameInter + fullnameSpeaker, dyads, addNA = T)
# names(dimnames(Xd_GovOppo)) <- NULL
# 
# Xd_sd <- xtabs(samedistrict ~ fullnameInter + fullnameSpeaker, dyads, addNA = T)
# names(dimnames(Xd_sd)) <- NULL
# 
# Xd_sameday <- xtabs(sameday_speeches ~ fullnameInter + fullnameSpeaker, dyads, addNA = T)
# names(dimnames(Xd_sameday)) <- NULL
# 
# Xd <- array(c(Xd_simil, Xd_OppoOppo, Xd_OppoGov, Xd_GovOppo, Xd_sd, Xd_sameday), dim = c(dim(Xd_simil), 6))
# Xd_rob <- array(c(Xd_simil_keyATM, Xd_OppoOppo, Xd_OppoGov, Xd_GovOppo, Xd_sd, Xd_sameday), dim = c(dim(Xd_simil), 6))
# 
# # Node Characteristics
# Xr <- dyads %>% select(fullnameInter, firsttermInter, femaleInter)
# Xr <- unique(Xr)
# Xr <- Xr %>% arrange(fullnameInter)
# Xr <- Xr %>% select(firsttermInter, femaleInter)
# Xr <- as.matrix(Xr)
# rownames(Xr) <- c(1:714)
# 
# Xc <- dyads %>% select(fullnameSpeaker, frontbencherSpeaker, femaleSpeaker, n_sentSpeaker)
# Xc <- unique(Xc)
# Xc <- Xc %>% arrange(fullnameSpeaker)
# Xc <- Xc %>% select(frontbencherSpeaker, femaleSpeaker, n_sentSpeaker)
# Xc <- as.matrix(Xc)
# rownames(Xc) <- c(1:714)
```

#### Model fitting



```{r}
# main model
# fitSRRM_bin_RR <-ame(Y_bin, Xdyad=Xd, Xrow=Xr, Xcol=Xc, family="bin")
# 
# save(fitSRRM_bin_RR, file = "fitSRRM_bin_RR.RData")

load("fitSRRM_bin_RR.RData")

# robustness
# fitSRRM_bin_RR_rob <-ame(Y_bin, Xdyad=Xd_rob, Xrow=Xr, Xcol=Xc, family="bin")
# 
# save(fitSRRM_bin_RR_rob, file = "fitSRRM_bin_RR_rob.RData")

load("fitSRRM_bin_RR_rob.RData")

# Summary
summary(fitSRRM_bin_RR)

summary(fitSRRM_bin_RR_rob)
```

## Marginal effects

```{r}
margins_1 <- margins(model1R_bin)

margins_sum <- summary(margins_1)

#plot(margins_1)

quant_diff <- as.numeric(quantile(dyads$topic_similarity_BERT, c(0.9), na.rm = T) - quantile(dyads$topic_similarity_BERT, c(0.5), na.rm = T))

# creating the AME for the reduced step in topic similarity
margins_sum$plotAME <- margins_sum$AME
margins_sum[margins_sum$factor == "topic_similarity_BERT","plotAME"] <- margins_sum[margins_sum$factor == "topic_similarity_BERT","AME"] * quant_diff
margins_sum[margins_sum$factor == "topic_similarity_BERT","lower"] <- margins_sum[margins_sum$factor == "topic_similarity_BERT","lower"] - (margins_sum[margins_sum$factor == "topic_similarity_BERT","AME"] - margins_sum[margins_sum$factor == "topic_similarity_BERT","plotAME"])
margins_sum[margins_sum$factor == "topic_similarity_BERT","upper"] <- margins_sum[margins_sum$factor == "topic_similarity_BERT","upper"] - (margins_sum[margins_sum$factor == "topic_similarity_BERT","AME"] - margins_sum[margins_sum$factor == "topic_similarity_BERT","plotAME"])

margins_plot <- margins_sum %>% filter(factor %in% c("topic_similarity_BERT", "OppoOppo", "OppoGov", "GovOppo", "firsttermInter"))
margins_plot$loc <- c(1:5)
```

```{r}
ggplot(data = margins_plot) +
  geom_point(aes(x = loc, y = plotAME), size = 2) +
  geom_linerange(aes(x = loc, ymin = lower, ymax = upper)) +
  scale_x_continuous(limits = c(0.8, 5.2),
                     breaks = c(1:5),
                     labels = c("First Term Interrupter", "Government -> Opposition",  "Opposition -> Government", "Opposition -> Opposition", "Topic Similarity (Median -> 90th Percentile)")) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  coord_flip() +
  theme_bw() +
  labs(x = "",
       y = "Average Marginal Effect",
                     caption = "Based on Model 1
                     Only variables with consistent effects across models
                     Baseline for Government-Opposition constellation are Government -> Government dyads")

ggsave(filename = "AMEs.jpeg", width = 9, height = 5)
```



# Topic Modeling

This section shows how the topic model used to create the topic similarity index was created. Since this model can take multiple hours to run, the first subsection of this section will simply load the output of the model and display a summary of the output. If you are interested, the following sections show how the topic model was fit and the similarity index calculated.

## Loading output

```{r}
load("out1_ex.RData")

top_words(out1_ex, n = 20)
```


## Topic Model Fitting

### Data Preparation

```{r}
# # loading the speeches
# load("speeches19_rep.RData")
# 
# # creating the corpus
# corpus19 <- corpus(speeches19, text_field = "speechContent")
# 
# # Extracting tokens and removing punctuation, numbers and stopwords that have no substantive meaning
# tokens19 <- tokens(corpus19,
#                     remove_numbers = T,
#                     remove_punct = T) %>%
#   tokens_tolower() %>%
#   tokens_remove(c(stopwords("de"), "dass", "damen", "herren"))
# 
# # creating the dfm, removing infrequent terms. Eshima et al. Suggest no more than 10000 unique terms so I use stronger trim restrictions. With this large and diverse of a corpus this is not achievable, but necessitates stronger trimming
# dfm19 <- dfm(tokens19) %>% dfm_trim(min_termfreq = 25, min_docfreq = 10)
# 
# ncol(dfm19)
# 
# # removing empty documents
# dfm19 <- dfm_subset(dfm19, ntoken(dfm19) > 0)
# 
# # reading everything into keyatm
# keyATM_docs <- keyATM_read(texts = dfm19, encoding = "UTF-8")
```

### Keyword Preparation

```{r}
# # creating the list of keywords
# keywords_ex <- list(covid = c("impfen", "masken", "pandemie", "schnelltests", "corona"),
#                  macroeconomics = c("wirtschaft", "unternehmen"),
#                  civil_rights = c("frauen", "gleichgeschlechtliche", "gleichberechtigung"),
#                  health = c("pflege", "patientinnen", "patienten", "krankenkassen"),
#                  labour = c("arbeit", "mindestlohn", "arbeitsmarkt"),
#                  immigration = c("integration", "migration", "flÃ¼chtlinge"),
#                  education = c("bildung", "schulen"),
#                  climate = c("klimaschutz", "kohleausstieg", "erneuerbaren", "co"),
#                  transport = c("bahn", "autos", "pkw", "verkehr"),
#                  law_crime = c("polizei", "kriminelle"),
#                  family = c("kinder", "familien", "eltern"),
#                  social_welfare = c("hartz", "rente", "arbeitslose"),
#                  banking_finance = c("banken", "finanzmÃ¤rkte", "ezb"),
#                  defence = c("bundeswehr", "soldaten"),
#                  science_tech = c("wissenschaft", "forschung", "digitalisierung"),
#                  international = c("entwicklungspolitik", "europa", "menschenrechte"),
#                  government_operations = c("haushalt", "bund", "kommunen"),
#                  housing = c("wohnungen", "miete", "bauen"),
#                  agriculture = c("landwirtschaft", "bauern"))
```

### Fitting the model

```{r}
# out1_ex <- keyATM(docs = keyATM_docs,
#                no_keyword_topics = 5,
#                keywords = keywords_ex,
#                model = "base",
#                options = list(seed = 28032023))
```

### creating topic similarity index

```{r}
# # this chunk matches the theta matrix to politicians and calculates the averages per politician
# 
# theta1_ex <- out1_ex$theta
# 
# colnames(theta1_ex) <- c("covid", "macroeconomics", "civil_rights", "health", "labour", "immigration", "education", "climate", "transport", "law_crime", "family", "social_welfare", "banking_finance", "defence", "science_tech", "international", "government_operations", "housing", "agriculture", "other1", "other2", "other3", "other4", "other5")
# 
# dfm19_data <- convert(dfm19, to = "data.frame")
# 
# doc_id <- dfm19_data$doc_id
# 
# theta1_ex <- cbind(doc_id, theta1_ex)
# 
# corp_data <- convert(corpus19, to = "data.frame")
# 
# corp_data <- merge(corp_data, theta1_ex, by = "doc_id")
# 
# corp_data$fullnameSpeaker <- paste(corp_data$firstNameSpeaker, corp_data$lastNameSpeaker, sep= " ")
# 
# 
# # converting all theta variables to numeric
# corp_data$covid <- as.numeric(corp_data$covid)
# corp_data$macroeconomics <- as.numeric(corp_data$macroeconomics)
# corp_data$civil_rights <- as.numeric(corp_data$civil_rights)
# corp_data$health <- as.numeric(corp_data$health)
# corp_data$labour <- as.numeric(corp_data$labour)
# corp_data$immigration <- as.numeric(corp_data$immigration)
# corp_data$education <- as.numeric(corp_data$education)
# corp_data$climate <- as.numeric(corp_data$climate)
# corp_data$transport <- as.numeric(corp_data$transport)
# corp_data$law_crime <- as.numeric(corp_data$law_crime)
# corp_data$family <- as.numeric(corp_data$family)
# corp_data$social_welfare <- as.numeric(corp_data$social_welfare)
# corp_data$banking_finance <- as.numeric(corp_data$banking_finance)
# corp_data$defence <- as.numeric(corp_data$defence)
# corp_data$science_tech <- as.numeric(corp_data$science_tech)
# corp_data$international <- as.numeric(corp_data$international)
# corp_data$government_operations <- as.numeric(corp_data$government_operations)
# corp_data$housing <- as.numeric(corp_data$housing)
# corp_data$agriculture <- as.numeric(corp_data$agriculture)
# corp_data$other1 <- as.numeric(corp_data$other1)
# corp_data$other2 <- as.numeric(corp_data$other2)
# corp_data$other3 <- as.numeric(corp_data$other3)
# corp_data$other4 <- as.numeric(corp_data$other4)
# corp_data$other5 <- as.numeric(corp_data$other5)
# 
# # mean of thetas for each speaker
# theta_sum_speaker <- corp_data %>% group_by(fullnameSpeaker) %>% summarize(covid_speaker = mean(covid), macroeconomics_speaker = mean(macroeconomics), civil_rights_speaker = mean(civil_rights), health_speaker = mean(health), labour_speaker = mean(labour), immigration_speaker = mean(immigration), education_speaker = mean(education), climate_speaker = mean(climate), transport_speaker = mean(transport), law_crime_speaker = mean(law_crime), family_speaker = mean(family), social_welfare_speaker = mean(social_welfare), banking_finance_speaker = mean(banking_finance), defence_speaker = mean(defence), science_tech_speaker = mean(science_tech), international_speaker = mean(international), government_operations_speaker = mean(government_operations), housing_speaker = mean(housing), agriculture_speaker = mean(agriculture), other1_speaker = mean(other1), other2_speaker = mean(other2), other3_speaker = mean(other3), other4_speaker = mean(other4), other5_speaker = mean(other5))
# 
# # mean of thetas for each inter
# theta_sum_inter <- corp_data %>% group_by(fullnameSpeaker) %>% summarize(covid_inter = mean(covid), macroeconomics_inter = mean(macroeconomics), civil_rights_inter = mean(civil_rights), health_inter = mean(health), labour_inter = mean(labour), immigration_inter = mean(immigration), education_inter = mean(education), climate_inter = mean(climate), transport_inter = mean(transport), law_crime_inter = mean(law_crime), family_inter = mean(family), social_welfare_inter = mean(social_welfare), banking_finance_inter = mean(banking_finance), defence_inter = mean(defence), science_tech_inter = mean(science_tech), international_inter = mean(international), government_operations_inter = mean(government_operations), housing_inter = mean(housing), agriculture_inter = mean(agriculture), other1_inter = mean(other1), other2_inter = mean(other2), other3_inter = mean(other3), other4_inter = mean(other4), other5_inter = mean(other5))
# 
# theta_sum_inter <- rename(theta_sum_inter, fullnameInter = fullnameSpeaker)
# 
# # now merging the thetas in
# dyads <- merge(dyads, theta_sum_speaker, by = "fullnameSpeaker", all.x = T)
# dyads <- merge(dyads, theta_sum_inter, by = "fullnameInter", all.x = T)
# 
# # calculating similarity index
# dyads <- dyads %>% mutate(topic_similarity = (1/2) * (2 - (abs(covid_speaker - covid_inter) +
#                                                    abs(macroeconomics_speaker - macroeconomics_inter) +
#                                                    abs(civil_rights_speaker - civil_rights_inter) +
#                                                    abs(health_speaker - health_inter) +
#                                                    abs(labour_speaker - labour_inter) +
#                                                    abs(immigration_speaker - immigration_inter) +
#                                                    abs(education_speaker - education_inter) +
#                                                    abs(climate_speaker - climate_inter) +
#                                                    abs(transport_speaker - transport_inter) +
#                                                    abs(law_crime_speaker - law_crime_inter) +
#                                                    abs(family_speaker - family_inter) +
#                                                    abs(social_welfare_speaker - social_welfare_inter) +
#                                                    abs(banking_finance_speaker - banking_finance_inter) +
#                                                    abs(defence_speaker - defence_inter) +
#                                                    abs(science_tech_speaker - science_tech_inter) +
#                                                    abs(international_speaker - international_inter) +
#                                                    abs(government_operations_speaker - government_operations_inter) +
#                                                    abs(housing_speaker - housing_inter) +
#                                                    abs(agriculture_speaker - agriculture_inter) +
#                                                    abs(other1_speaker - other1_inter) +
#                                                    abs(other2_speaker - other2_inter) +
#                                                    abs(other3_speaker - other3_inter) +
#                                                    abs(other4_speaker - other4_inter) +
#                                                    abs(other5_speaker - other5_inter))))
```










